# Install/upgrade the numerapi client. The "!" prefix is an IPython shell
# escape (notebook-only syntax), not Python — these lines run pip in a subshell.
!pip install --upgrade pip
!pip install --upgrade numerapi
Collecting pip Using cached pip-22.0.4-py3-none-any.whl (2.1 MB) ERROR: causalml 0.9.0 has requirement numpy<1.19.0,>=0.16.0, but you'll have numpy 1.19.1 which is incompatible. ERROR: causalml 0.9.0 has requirement scipy==1.4.1, but you'll have scipy 1.6.0 which is incompatible. Installing collected packages: pip Successfully installed pip-22.0.4 Requirement already up-to-date: numerapi in /home/jacobstahl/.local/lib/python3.8/site-packages (2.11.0) Requirement already satisfied, skipping upgrade: python-dateutil in /usr/lib/python3/dist-packages (from numerapi) (2.7.3) Requirement already satisfied, skipping upgrade: requests in /home/jacobstahl/.local/lib/python3.8/site-packages (from numerapi) (2.24.0) Requirement already satisfied, skipping upgrade: pytz in /usr/lib/python3/dist-packages (from numerapi) (2019.3) Requirement already satisfied, skipping upgrade: pandas>=1.1.0 in /home/jacobstahl/.local/lib/python3.8/site-packages (from numerapi) (1.2.1) Requirement already satisfied, skipping upgrade: tqdm>=4.29.1 in /home/jacobstahl/.local/lib/python3.8/site-packages (from numerapi) (4.51.0) Requirement already satisfied, skipping upgrade: click>=7.0 in /usr/lib/python3/dist-packages (from numerapi) (7.0) Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /home/jacobstahl/.local/lib/python3.8/site-packages (from requests->numerapi) (2020.11.8) Requirement already satisfied, skipping upgrade: idna<3,>=2.5 in /usr/lib/python3/dist-packages (from requests->numerapi) (2.8) Requirement already satisfied, skipping upgrade: chardet<4,>=3.0.2 in /usr/lib/python3/dist-packages (from requests->numerapi) (3.0.4) Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/lib/python3/dist-packages (from requests->numerapi) (1.25.8) Requirement already satisfied, skipping upgrade: numpy>=1.16.5 in /home/jacobstahl/.local/lib/python3.8/site-packages (from pandas>=1.1.0->numerapi) (1.19.1)
import pandas as pd
import numpy as np
import numerapi
import os

# Ensure the local data directory exists.
# os.makedirs(..., exist_ok=True) replaces the check-then-mkdir idiom, which is
# race-prone (another process could create the directory between the check and
# the mkdir call) and was the only reason for the explicit os.path.exists test.
os.makedirs("data", exist_ok=True)

# Download the training data using numerapi.
# https://pypi.org/project/numerapi/
napi = numerapi.NumerAPI(verbosity="info")
napi.download_dataset("numerai_training_data.parquet", "data/numerai_training_data.parquet")
2022-05-01 19:34:54,149 INFO numerapi.utils: target file already exists 2022-05-01 19:34:54,149 INFO numerapi.utils: download complete
# Load the full training set from the parquet file downloaded above.
TRAIN_PARQUET = "data/numerai_training_data.parquet"
training_set = pd.read_parquet(TRAIN_PARQUET)
# Preview the first few rows (rendered by the notebook).
training_set.head()
| era | data_type | feature_dichasial_hammier_spawner | feature_rheumy_epistemic_prancer | feature_pert_performative_hormuz | feature_hillier_unpitied_theobromine | feature_perigean_bewitching_thruster | feature_renegade_undomestic_milord | feature_koranic_rude_corf | feature_demisable_expiring_millepede | ... | target_paul_20 | target_paul_60 | target_george_20 | target_george_60 | target_william_20 | target_william_60 | target_arthur_20 | target_arthur_60 | target_thomas_20 | target_thomas_60 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||||||||||||
| n003bba8a98662e4 | 0001 | train | 1.0 | 0.50 | 1.00 | 1.00 | 0.00 | 0.00 | 1.00 | 1.00 | ... | 0.25 | 0.25 | 0.25 | 0.00 | 0.166667 | 0.000000 | 0.166667 | 0.000000 | 0.166667 | 0.000000 |
| n003bee128c2fcfc | 0001 | train | 0.5 | 1.00 | 0.25 | 0.75 | 0.00 | 0.75 | 0.50 | 0.75 | ... | 1.00 | 1.00 | 1.00 | 1.00 | 0.833333 | 0.666667 | 0.833333 | 0.666667 | 0.833333 | 0.666667 |
| n0048ac83aff7194 | 0001 | train | 0.5 | 0.25 | 0.75 | 0.00 | 0.75 | 0.00 | 0.75 | 0.75 | ... | 0.50 | 0.25 | 0.25 | 0.25 | 0.500000 | 0.333333 | 0.500000 | 0.333333 | 0.500000 | 0.333333 |
| n00691bec80d3e02 | 0001 | train | 1.0 | 0.50 | 0.50 | 0.75 | 0.00 | 1.00 | 0.25 | 1.00 | ... | 0.50 | 0.50 | 0.50 | 0.50 | 0.666667 | 0.500000 | 0.500000 | 0.500000 | 0.666667 | 0.500000 |
| n00b8720a2fdc4f2 | 0001 | train | 1.0 | 0.75 | 1.00 | 1.00 | 0.00 | 0.00 | 1.00 | 0.50 | ... | 0.50 | 0.50 | 0.50 | 0.50 | 0.666667 | 0.500000 | 0.500000 | 0.500000 | 0.666667 | 0.500000 |
5 rows × 1073 columns
print("number of eras:", len(training_set.era.unique()))
print("number of rows:", len(training_set))

# Feature columns all carry the "feature_" prefix. Match on the prefix rather
# than a substring ("feature_" in f) so a column merely containing that text
# somewhere in its name can never be picked up as a feature by accident.
feature_names = [f for f in training_set.columns if f.startswith("feature_")]

# Subsample to speed up training and save memory.
# A fixed random_state makes the subsample — and everything trained on it —
# reproducible across notebook runs.
training_set = training_set.sample(100000, random_state=42)
number of eras: 574 number of rows: 2412105
# Baseline model: ordinary least-squares regression on all features (sklearn).
from sklearn.linear_model import LinearRegression

# LinearRegression.fit returns the fitted estimator itself,
# so construction and fitting chain into a single statement.
linear_model = LinearRegression().fit(training_set[feature_names], training_set["target"])
LinearRegression()
# Fetch the validation split used below to evaluate every model.
VALIDATION_PARQUET = "data/numerai_validation_data.parquet"
napi.download_dataset("numerai_validation_data.parquet", VALIDATION_PARQUET)
validation_set = pd.read_parquet(VALIDATION_PARQUET)
2022-05-01 19:35:05,998 INFO numerapi.utils: target file already exists 2022-05-01 19:35:05,999 INFO numerapi.utils: download complete
# Score the linear baseline on the validation set.
predictions = linear_model.predict(validation_set[feature_names])
validation_set["linear_prediction"] = predictions


def _linear_era_corr(era_frame):
    # Pearson correlation between the linear predictions and the target
    # within a single era (np.corrcoef returns the 2x2 correlation matrix).
    return np.corrcoef(era_frame["linear_prediction"], era_frame["target"])[0, 1]


# Per-era correlation — the metric used to compare models throughout.
era_correlations = validation_set.groupby("era").apply(_linear_era_corr)
# Install/upgrade plotly for interactive charts ("!" is a notebook shell escape).
!pip install --upgrade plotly
import plotly.express as px
# Bar chart of the per-era correlation between the linear model's predictions
# and the target (one bar per era).
fig = px.bar(era_correlations)
# y axis is correlation, x axis is era
fig.update_layout(title="Correlation between prediction and target by era")
fig.show(renderer="notebook")
Requirement already up-to-date: plotly in /home/jacobstahl/.local/lib/python3.8/site-packages (5.7.0) Requirement already satisfied, skipping upgrade: tenacity>=6.2.0 in /home/jacobstahl/.local/lib/python3.8/site-packages (from plotly) (8.0.1) Requirement already satisfied, skipping upgrade: six in /home/jacobstahl/.local/lib/python3.8/site-packages (from plotly) (1.15.0)
# Cumulative sum of the per-era correlations. Era correlations proxy returns,
# so their running total estimates expected returns without compounding.
cum_sum = era_correlations.cumsum()
fig = px.bar(cum_sum)
fig.update_layout(title="Cumulative sum of era correlations")
fig.show(renderer="notebook")
# Train a gradient-boosted tree model (CatBoost) to compare against the
# linear regression baseline.
# https://catboost.ai/
!pip install --upgrade catboost
import catboost
# NOTE(review): hyperparameters appear hand-picked; no tuning is shown here.
cat_model = catboost.CatBoostRegressor(
iterations=1000,
learning_rate=0.01,
depth=6,
)
# verbose=False suppresses CatBoost's per-iteration training log.
cat_model.fit(training_set[feature_names], training_set["target"], verbose=False)
# Evaluate CatBoost on the validation set with the same per-era metric.
predictions = cat_model.predict(validation_set[feature_names])
validation_set["cat_prediction"] = predictions


def _cat_era_corr(era_frame):
    # Pearson correlation between CatBoost predictions and targets in one era.
    return np.corrcoef(era_frame["cat_prediction"], era_frame["target"])[0, 1]


era_correlations = validation_set.groupby("era").apply(_cat_era_corr)
import plotly.express as px

# Per-era correlation bar chart for the CatBoost model
# (x axis: era, y axis: correlation).
chart_title = "Correlation between prediction and target by era"
fig = px.bar(era_correlations)
fig.update_layout(title=chart_title)
fig.show(renderer="notebook")
Requirement already up-to-date: catboost in /home/jacobstahl/.local/lib/python3.8/site-packages (1.0.5) Requirement already satisfied, skipping upgrade: pandas>=0.24.0 in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (1.2.1) Requirement already satisfied, skipping upgrade: matplotlib in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (3.3.4) Requirement already satisfied, skipping upgrade: graphviz in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (0.17) Requirement already satisfied, skipping upgrade: plotly in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (5.7.0) Requirement already satisfied, skipping upgrade: numpy>=1.16.0 in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (1.19.1) Requirement already satisfied, skipping upgrade: six in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (1.15.0) Requirement already satisfied, skipping upgrade: scipy in /home/jacobstahl/.local/lib/python3.8/site-packages (from catboost) (1.6.0) Requirement already satisfied, skipping upgrade: python-dateutil>=2.7.3 in /usr/lib/python3/dist-packages (from pandas>=0.24.0->catboost) (2.7.3) Requirement already satisfied, skipping upgrade: pytz>=2017.3 in /usr/lib/python3/dist-packages (from pandas>=0.24.0->catboost) (2019.3) Requirement already satisfied, skipping upgrade: cycler>=0.10 in /home/jacobstahl/.local/lib/python3.8/site-packages (from matplotlib->catboost) (0.10.0) Requirement already satisfied, skipping upgrade: kiwisolver>=1.0.1 in /home/jacobstahl/.local/lib/python3.8/site-packages (from matplotlib->catboost) (1.3.1) Requirement already satisfied, skipping upgrade: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /home/jacobstahl/.local/lib/python3.8/site-packages (from matplotlib->catboost) (2.4.7) Requirement already satisfied, skipping upgrade: pillow>=6.2.0 in /home/jacobstahl/.local/lib/python3.8/site-packages (from matplotlib->catboost) (7.2.0) 
Requirement already satisfied, skipping upgrade: tenacity>=6.2.0 in /home/jacobstahl/.local/lib/python3.8/site-packages (from plotly->catboost) (8.0.1)
# Running total of the CatBoost per-era correlations — visually this sits above
# the linear model's curve, i.e. CatBoost is the stronger model so far.
cum_sum = era_correlations.cumsum()
fig = px.bar(cum_sum)
fig.update_layout(title="Cumulative sum of era correlations")
fig.show(renderer="notebook")
import torch
from torch import nn

# Simple MLP regressor: two ReLU hidden layers of width 256, scalar output.
# https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html
HIDDEN = 256
layers = [
    nn.Linear(len(feature_names), HIDDEN),
    nn.ReLU(),
    nn.Linear(HIDDEN, HIDDEN),
    nn.ReLU(),
    nn.Linear(HIDDEN, 1),
]
model = nn.Sequential(*layers)
import copy

# Move the model to the GPU; training batches are shipped there per iteration.
model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()
NUM_ITERATIONS = 300
BATCH_SIZE = 2048

# Keep full datasets as CPU tensors; only the current batch moves to the GPU.
training_features = torch.tensor(training_set[feature_names].values).float()
training_target = torch.tensor(training_set["target"].values).float()
validation_features = torch.tensor(validation_set[feature_names].values, dtype=torch.float32)
validation_target = torch.tensor(validation_set["target"].values, dtype=torch.float32)

validation_corrs = []
# BUG FIX: snapshot the initial weights instead of aliasing the live model.
# `best_model = model` only copied the reference, so "best_model" would have
# silently tracked the latest weights rather than the best-scoring ones.
best_model = copy.deepcopy(model)
record_validation_corr = -9999
# Pre-draw one random batch of row indices (without replacement within a
# batch) for every training iteration.
batches = [np.random.choice(training_set.shape[0], size=BATCH_SIZE, replace=False) for _ in range(NUM_ITERATIONS)]
import copy

for iteration, batch in enumerate(batches):
    # No-op after the first pass unless the model was reset below on the CPU.
    model.cuda()
    X, y = training_features[batch], training_target[batch]
    X = X.cuda()
    y = y.cuda()
    # Flatten each sample's features into a single vector for the MLP.
    X = X.view(X.shape[0], -1)
    # Forward pass and loss.
    outputs = model(X)
    loss = criterion(outputs.squeeze(), y.squeeze())
    # Backward pass and weight update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Evaluate mean per-era correlation on the validation set (on CPU).
    model.cpu()
    # PERF FIX: reuse the pre-built `validation_features` tensor instead of
    # re-converting the entire validation DataFrame on every iteration;
    # torch.no_grad() avoids building an autograd graph for evaluation.
    with torch.no_grad():
        predictions = model(validation_features).cpu().numpy()
    validation_set["prediction"] = predictions
    era_correlations = validation_set.groupby("era").apply(
        lambda era: np.corrcoef(era["prediction"], era["target"])[0, 1]
    )
    # Hoist the repeated .mean() — the original recomputed it four times.
    mean_corr = era_correlations.mean()
    validation_corrs.append(mean_corr)
    print(f"iteration {iteration} loss {loss.item()} mean era correlation {mean_corr}", end="\r")

    if mean_corr > record_validation_corr:
        record_validation_corr = mean_corr
        # BUG FIX: snapshot the weights. `best_model = model` aliased the live
        # model, so the "best" model kept training and the record was meaningless.
        best_model = copy.deepcopy(model)

    # Reset the model if validation correlation is negative after a warm-up
    # period, or is NaN — training occasionally gets stuck right after
    # initialization and never recovers. (Parentheses make the original
    # and/or precedence explicit; np.isnan replaces the x != x NaN trick.)
    if (iteration > 10 and mean_corr < 0) or np.isnan(mean_corr):
        model = nn.Sequential(
            nn.Linear(len(feature_names), 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        model.cuda()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        criterion = nn.MSELoss()
iteration 299 loss 0.050988636910915375 mean era correlation 0.0145012277351259386
# Learning curve: mean per-era validation correlation after each iteration
# (x axis: iteration, y axis: mean era correlation).
fig = px.bar(validation_corrs)
fig.update_layout(title="Mean era correlation on validation set by iteration")
fig.update_xaxes(title="Iteration")
fig.update_yaxes(title="Mean era correlation")
fig.show(renderer="notebook")
# Score the best neural network on the validation set, then compute per-era
# correlations and their cumulative sums for all three models the same way.
best_model.cpu()
predictions = best_model(torch.tensor(validation_set[feature_names].values, dtype=torch.float32)).detach().cpu().numpy()
validation_set["nn_prediction"] = predictions


def _era_correlations_for(column):
    # Per-era Pearson correlation between one prediction column and the target.
    return validation_set.groupby("era").apply(
        lambda era: np.corrcoef(era[column], era["target"])[0, 1]
    )


nn_era_correlations = _era_correlations_for("nn_prediction")
nn_cum_sum = np.cumsum(nn_era_correlations)

cat_model_correlations = _era_correlations_for("cat_prediction")
cat_model_cum_sum = np.cumsum(cat_model_correlations)

linear_model_correlations = _era_correlations_for("linear_prediction")
linear_model_cum_sum = np.cumsum(linear_model_correlations)
# Ensemble the NN and CatBoost models: z-score each model's predictions so
# they are on a comparable scale, then average the two.
# (Column/variable spelling "ensamble" is kept — later cells read these names.)
def _zscore(series):
    return (series - series.mean()) / series.std()


normalized_nn_predictions = _zscore(validation_set["nn_prediction"])
normalized_cat_predictions = _zscore(validation_set["cat_prediction"])
validation_set["ensamble_prediction"] = (normalized_nn_predictions + normalized_cat_predictions) / 2

ensamble_era_correlations = validation_set.groupby("era").apply(
    lambda era: np.corrcoef(era["ensamble_prediction"], era["target"])[0, 1]
)
ensamble_cum_sum = np.cumsum(ensamble_era_correlations)
2022-05-01 21:12:36,471 INFO numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8. 2022-05-01 21:12:36,471 INFO numexpr.utils: NumExpr defaulting to 8 threads.
# Overlay the cumulative per-era correlation of every model on one chart.
# Importing the submodule explicitly guarantees it is loaded (the original
# relied on `import plotly` plus plotly.express having pulled it in already).
from plotly import graph_objects as go

fig = go.Figure()
fig.add_scatter(x=np.arange(len(nn_era_correlations)), y=nn_cum_sum, name="Neural Network")
fig.add_scatter(x=np.arange(len(cat_model_correlations)), y=cat_model_cum_sum, name="CatBoost")
fig.add_scatter(x=np.arange(len(linear_model_correlations)), y=linear_model_cum_sum, name="Linear Regression")
fig.add_scatter(x=np.arange(len(ensamble_era_correlations)), y=ensamble_cum_sum, name="Ensamble")
fig.update_layout(title="Cumulative Correlation between prediction and target by era")
fig.show(renderer="notebook")